library(tidyverse)
library(janitor)
library(here)
library(hrbrthemes)
library(skimr)
library(kableExtra)
library(DT)
library(outbreaks)
library(linelist)
library(earlyR)
library(incidence)
library(incidence2)
library(i2extras)
library(projections)
library(epicontacts)
library(outbreaker2)
library(ape)
theme_set(theme_minimal())
evd_linelist_raw <- here::here("data", "phm_evd_linelist_2017-11-25.xlsx") %>%
rio::import()
evd_contacts_raw <- here::here("data", "phm_evd_contacts_2017-11-25.xlsx") %>%
rio::import()
evd_cleaning_rules <- here::here("data", "phm_evd_cleaning_rules.xlsx") %>%
rio::import()
evd_linelist_raw
## data.frame [50, 6]
## #case-ID chr 39e9dc 664549 b4d8aa 51883d 947e40 9aa197
## Date Of Onset chr 43018 43024 43025 “18// 10/2017” 43028 43028
## Date.of.report chr 43030 43032 “23/10/2017” 22-10-2017 “2017/10/25” “2017-10-23”
## SEX. chr Female male male male female female
## Age dbl 62 28 54 57 23 66
## Location chr Pseudopolis peudopolis Ankh-Morpork PSEUDOPOLIS Ankh Morpork Pseudopolis
evd_contacts_raw
## data.frame [36, 2]
## Source Case #ID chr 9aa197 39e9dc c2a389 51883d 51883d 933811
## case.id chr 426b6d a8e9d8 95fc1d 778316 e37897 99abbe
evd_cleaning_rules
## data.frame [6, 3]
## bad_spelling chr f m .missing am peudopolis .missing
## good_spelling chr female male unknown ankh_morpork pseudopolis unknown
## variable chr sex sex sex location location location
evd_linelist <- evd_linelist_raw %>%
linelist::clean_data(wordlists = evd_cleaning_rules) %>%
mutate(across(contains("date"), guess_dates))
evd_contacts <- evd_contacts_raw %>%
linelist::clean_data()
evd_linelist
## data.frame [50, 6]
## case_id chr 39e9dc 664549 b4d8aa 51883d 947e40 9aa197
## date_of_onset date 2017-10-10 2017-10-16 2017-10-17 2017-10-18 2017-10-20 2017-10-20
## date_of_report date 2017-10-22 2017-10-24 2017-10-23 2017-10-22 2017-10-25 2017-10-23
## sex chr female male male male female female
## age dbl 62 28 54 57 23 66
## location chr pseudopolis pseudopolis ankh_morpork pseudopolis ankh_morpork pseudopolis
evd_contacts
## data.frame [36, 2]
## source_case_id chr 9aa197 39e9dc c2a389 51883d 51883d 933811
## case_id chr 426b6d a8e9d8 95fc1d 778316 e37897 99abbe
evd_chains <- make_epicontacts(
linelist = evd_linelist,
contacts = evd_contacts,
id = "case_id",
from = "source_case_id",
to = "case_id",
directed = TRUE
)
evd_chains
##
## /// Epidemiological Contacts //
##
## // class: epicontacts
## // 50 cases in linelist; 36 contacts; directed
##
## // linelist
##
## tibble [50, 6]
## id chr 39e9dc 664549 b4d8aa 51883d 947e40 9aa197
## date_of_onset date 2017-10-10 2017-10-16 2017-10-17 2017-10-18 2017-10-20 2017-10-20
## date_of_report date 2017-10-22 2017-10-24 2017-10-23 2017-10-22 2017-10-25 2017-10-23
## sex chr female male male male female female
## age dbl 62 28 54 57 23 66
## location chr pseudopolis pseudopolis ankh_morpork pseudopolis ankh_morpork pseudopolis
##
## // contacts
##
## tibble [36, 2]
## from chr 9aa197 39e9dc c2a389 51883d 51883d 933811
## to chr 426b6d a8e9d8 95fc1d 778316 e37897 99abbe
plot(
evd_chains,
node_color = "location",
node_shape = "sex",
shapes = c(male = "male", female = "female")
)
It seems that individuals from pseudopolis are the main sources of infection, and almost exclusively infect individuals from ank morpork, with very limited tranmission between individuals from the same location.
get_pairwise(evd_chains, attribute = "location", f = table)
## values.to
## values.from ankh_morpork pseudopolis
## ankh_morpork 13 4
## pseudopolis 16 3
get_pairwise(evd_chains, attribute = "location", f = table) %>%
chisq.test(simulate = TRUE)
##
## Pearson's Chi-squared test with simulated p-value (based on 2000 replicates)
##
## data: .
## X-squared = 0.34315, df = NA, p-value = 0.6837
Tabulating and calculating a test, it appears that there isn’t a pattern of differential infections, just that individuals from ankh morpork are more likely to be infected.
evd_incidence <- evd_linelist %>%
incidence2::incidence(
date_index = "date_of_onset",
groups = "location"
)
evd_poisson_fit <- evd_incidence %>%
i2extras::fit_curve(model = "poisson")
plot(evd_poisson_fit)
evd_poisson_growth <- i2extras::growth_rate(
evd_poisson_fit,
growth_decay_time = TRUE
)
min_date <- min(evd_linelist$date_of_onset, na.rm = TRUE) %>%
format("%B %d, %Y")
max_date <- max(evd_linelist$date_of_onset, na.rm = TRUE) %>%
format("%B %d, %Y")
evd_poisson_growth %>%
ggplot(aes(x = r, y = fct_reorder(location, r))) +
geom_errorbar(aes(xmin = r_lower, xmax = r_upper), width = 0.2) +
geom_point(color = "#7c1073", size = 3) +
geom_vline(aes(xintercept = 0), linetype = 2, color = "#3b3b3b") +
labs(
title = "Estimated daily growth rate of Ebola in Ankh by region",
subtitle = glue::glue(
"based on symptom onset date, {min_date} - {max_date}"
)
)
Based on both the epidemic curve and the estimated growth rate CIs, it is not possible to tell if the outbreak is growing or shrinking. There is likely insufficient data to make a reliable estimate of the growth rate.
evd_si <- evd_chains %>%
get_pairwise("date_of_onset") %>%
epitrix::fit_disc_gamma()
evd_si
## $mu
## [1] 13.09018
##
## $cv
## [1] 0.6489035
##
## $sd
## [1] 8.494266
##
## $ll
## [1] -122.525
##
## $converged
## [1] TRUE
##
## $distribution
## A discrete distribution
## name: gamma
## parameters:
## shape: 2.37486979332248
## scale: 5.51195897078819
evd_Rt <- incidence::incidence(evd_linelist$date_of_onset) %>%
EpiEstim::estimate_R(
method = "parametric_si",
config = EpiEstim::make_config(mean_si = evd_si$mu, std_si = evd_si$sd)
)
## Default config will estimate R on weekly sliding windows.
## To change this change the t_start and t_end arguments.
## Warning in estimate_R_func(incid = incid, method = method, si_sample = si_sample, : You're estimating R too early in the epidemic to get the desired
## posterior CV.
plot(evd_Rt)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## Warning: Removed 5 row(s) containing missing values (geom_path).
If nothing changes, it seems likely that the outbreak would continue at a relatively steady rate.
evd_dna <- read.FASTA(here::here("data", "phm_evd_wgs.fa"))
evd_dna
## 50 DNA sequences in binary format stored in a list.
##
## All sequences of same length: 18958
##
## Labels:
## 39e9dc
## 664549
## b4d8aa
## 51883d
## 947e40
## 9aa197
## ...
##
## Base composition:
## a c g t
## 0.250 0.248 0.248 0.254
## (Total: 947.9 kb)
identical(labels(evd_dna), evd_linelist$case_id)
## [1] TRUE
neighbour_join <- nj(dist.dna(evd_dna, model = "N"))
neighbour_join
##
## Phylogenetic tree with 50 tips and 48 internal nodes.
##
## Tip labels:
## 39e9dc, 664549, b4d8aa, 51883d, 947e40, 9aa197, ...
##
## Unrooted; includes branch lengths.
neighbour_join <- root(neighbour_join, 1)
plot(neighbour_join, main = "Neighbour Joining tree")
axisPhylo()
Here, our index case is 39e9dc, but we can see a lot of branches of length 0, which could be due to a number of reasons. For example, cases 947e40 and 601d2e could have been infected by an individual outside of out sample, or they could have been infected by 39e9dc and didn’t experience any genetic substitution. As a result of these branches of length 0, it is hard to identify the source of infections for a large number of individuals.
incub_mu <- 10.6
incub_sd <- 7.4
incub_cv <- incub_sd / incub_mu
incub_params <- epitrix::gamma_mucv2shapescale(incub_mu, incub_cv)
evd_incubation <- distcrete::distcrete(
name = "gamma",
shape = incub_params$shape,
scale = incub_params$scale,
w = 0.5,
interval = 1
)
evd_incubation
## A discrete distribution
## name: gamma
## parameters:
## shape: 2.0518626734843
## scale: 5.16603773584906
tibble(days = 0:30, prob = evd_incubation$d(0:30)) %>%
ggplot(aes(x = days, y = prob)) +
geom_col() +
labs(
title = "Incubation period distribution",
x = "Days from infection to onset of symptoms",
y = "Probability"
)
tibble(days = 0:50, prob = evd_si$distribution$d(0:50)) %>%
ggplot(aes(x = days, y = prob)) +
geom_col() +
labs(
title = "Serial Interval distribution",
x = "Days between onset of symptoms in primary and secondary cases",
y = "Probability"
)
outbreaker modeloutbreak_data <- outbreaker_data(
dates = evd_linelist$date_of_onset,
dna = unname(evd_dna),
w_dens = evd_si$distribution$d(1:100),
f_dens = evd_incubation$d(1:100)
)
outbreak_config <- create_config(
move_kappa = FALSE,
move_pi = FALSE,
init_pi = 1,
find_import = FALSE,
init_tree = "star"
)
set.seed(0)
basic_result <- outbreaker(data = outbreak_data, config = outbreak_config)
basic_result
##
##
## ///// outbreaker results ///
##
## class: outbreaker_chains data.frame
## dimensions 201 rows, 158 columns
## ancestries not shown: alpha_1 - alpha_50
## infection dates not shown: t_inf_1 - t_inf_50
## intermediate generations not shown: kappa_1 - kappa_50
##
## /// head //
## data.frame [3, 8]
## step dbl 1 50 100
## post dbl -1959.847683 -814.859065 -818.448079
## like dbl -1962.150168 -817.161595 -820.750622
## prior dbl 2.302485 2.30253 2.302543
## mu dbl 0.0001 0.000055 0.000042
## pi dbl 1 1 1
## eps dbl 0.5 0.5 0.5
## lambda dbl 0.05 0.05 0.05
##
## ...
## /// tail //
## data.frame [3, 8]
## step dbl 9900 9950 10000
## post dbl -806.40595 -805.818206 -807.969237
## like dbl -808.70848 -808.12075 -810.271772
## prior dbl 2.30253 2.302544 2.302535
## mu dbl 0.000055 0.000041 0.00005
## pi dbl 1 1 1
## eps dbl 0.5 0.5 0.5
## lambda dbl 0.05 0.05 0.05
plot(basic_result)
outbreak_plot_type <- c(
"trace", "density", "alpha", "t_inf"
)
map(
.x = outbreak_plot_type,
.f = function(.x) {
if (.x == "density") {
plot(basic_result, "mu", type = .x, burn = 500)
} else {
plot(basic_result, type = .x, burn = 500)
}
}
)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
plot(basic_result, , type = "network", burn = 500, min_support = 0.05)
basic_res_summary <- summary(basic_result)
basic_res_summary
## $step
## first last interval n_steps
## 1 10000 50 201
##
## $post
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1959.8 -817.6 -813.1 -819.0 -808.7 -798.9
##
## $like
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1962.2 -819.9 -815.4 -821.3 -811.0 -801.2
##
## $prior
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.302 2.303 2.303 2.303 2.303 2.303
##
## $mu
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.400e-05 4.601e-05 5.114e-05 5.123e-05 5.610e-05 1.000e-04
##
## $pi
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
##
## $tree
## from to time support generations
## 1 NA 1 -10 0.2089552 NA
## 2 1 2 -1 0.4577114 1
## 3 1 3 1 0.3830846 1
## 4 1 4 -2 0.3532338 1
## 5 1 5 -4 0.5422886 1
## 6 4 6 4 0.4726368 1
## 7 3 7 7 0.7512438 1
## 8 3 8 6 0.8706468 1
## 9 4 9 3 0.5820896 1
## 10 1 10 -2 0.5074627 1
## 11 1 11 4 0.3532338 1
## 12 1 12 5 0.3930348 1
## 13 4 13 8 0.4975124 1
## 14 6 14 10 0.6666667 1
## 15 5 15 11 0.3781095 1
## 16 4 16 12 0.3233831 1
## 17 6 17 16 0.3134328 1
## 18 6 18 12 0.4726368 1
## 19 6 19 17 0.2835821 1
## 20 6 20 17 0.2636816 1
## 21 6 21 16 0.3233831 1
## 22 6 22 15 0.3383085 1
## 23 4 23 14 0.3034826 1
## 24 9 24 18 0.2587065 1
## 25 6 25 19 0.2039801 1
## 26 8 26 18 0.5920398 1
## 27 6 27 18 0.2487562 1
## 28 33 28 21 0.2935323 1
## 29 9 29 17 0.3134328 1
## 30 8 30 22 0.3582090 1
## 31 13 31 20 0.2338308 1
## 32 30 32 25 0.5820896 1
## 33 28 33 25 0.7014925 1
## 34 20 34 24 0.8308458 1
## 35 8 35 22 0.4875622 1
## 36 8 36 20 0.6119403 1
## 37 27 37 28 0.1691542 1
## 38 34 38 32 0.6567164 1
## 39 11 39 21 0.9552239 1
## 40 14 40 25 0.1990050 1
## 41 15 41 24 0.9552239 1
## 42 3 42 26 0.4975124 1
## 43 41 43 32 0.6417910 1
## 44 35 44 30 0.7910448 1
## 45 22 45 28 0.1691542 1
## 46 30 46 32 0.4278607 1
## 47 34 47 31 0.8358209 1
## 48 40 48 29 0.1890547 1
## 49 32 49 36 0.3333333 1
## 50 40 50 34 0.1741294 1
basic_res_summary$tree %>%
ggplot(aes(x = support)) +
geom_histogram(fill = "grey40", color = "white", bins = 12) +
scale_x_continuous(limits = c(0, 1)) +
labs(
title = "Consesus ancestry: support"
)
## Warning: Removed 2 rows containing missing values (geom_bar).
outbreak_temp_data <- outbreaker_data(
dates = evd_linelist$date_of_onset,
w_dens = evd_si$distribution$d(1:100),
f_dens = evd_incubation$d(1:100)
)
set.seed(0)
basic_temp_result <- outbreaker(
data = outbreak_temp_data, config = outbreak_config
)
basic_temp_result
##
##
## ///// outbreaker results ///
##
## class: outbreaker_chains data.frame
## dimensions 201 rows, 158 columns
## ancestries not shown: alpha_1 - alpha_50
## infection dates not shown: t_inf_1 - t_inf_50
## intermediate generations not shown: kappa_1 - kappa_50
##
## /// head //
## data.frame [3, 8]
## step dbl 1 50 100
## post dbl -365.679658 -300.076884 -295.882138
## like dbl -367.982143 -302.379369 -298.184623
## prior dbl 2.302485 2.302485 2.302485
## mu dbl 0.0001 0.0001 0.0001
## pi dbl 1 1 1
## eps dbl 0.5 0.5 0.5
## lambda dbl 0.05 0.05 0.05
##
## ...
## /// tail //
## data.frame [3, 8]
## step dbl 9900 9950 10000
## post dbl -305.136006 -303.676142 -308.474933
## like dbl -307.438491 -305.978627 -310.777418
## prior dbl 2.302485 2.302485 2.302485
## mu dbl 0.0001 0.0001 0.0001
## pi dbl 1 1 1
## eps dbl 0.5 0.5 0.5
## lambda dbl 0.05 0.05 0.05
plot(basic_temp_result, type = "alpha")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
contact_data <- matrix(
match(
unlist(evd_chains$contacts),
evd_linelist$case_id
),
ncol = 2)
dim(contact_data)
## [1] 36 2
outbreak_cont_data <- outbreaker_data(
dates = evd_linelist$date_of_onset,
dna = unname(evd_dna),
ctd = contact_data,
w_dens = evd_si$distribution$d(1:100),
f_dens = evd_incubation$d(1:100)
)
set.seed(0)
full_result <- outbreaker(
data = outbreak_cont_data, config = outbreak_config
)
full_result
##
##
## ///// outbreaker results ///
##
## class: outbreaker_chains data.frame
## dimensions 201 rows, 158 columns
## ancestries not shown: alpha_1 - alpha_50
## infection dates not shown: t_inf_1 - t_inf_50
## intermediate generations not shown: kappa_1 - kappa_50
##
## /// head //
## data.frame [3, 8]
## step dbl 1 50 100
## post dbl -2133.492486 -879.824167 -842.566586
## like dbl -2135.794972 -882.126696 -844.869121
## prior dbl 2.302485 2.302529 2.302535
## mu dbl 0.0001 0.000056 0.00005
## pi dbl 1 1 1
## eps dbl 0.5 0.58305 0.844773
## lambda dbl 0.05 0.009765 0.002261
##
## ...
## /// tail //
## data.frame [3, 8]
## step dbl 9900 9950 10000
## post dbl -840.629255 -836.36688 -842.49067
## like dbl -842.931782 -838.669416 -844.793195
## prior dbl 2.302527 2.302537 2.302524
## mu dbl 0.000058 0.000049 0.000061
## pi dbl 1 1 1
## eps dbl 0.771107 0.665784 0.596004
## lambda dbl 0.001723 0.001083 0.002275
map(
.x = outbreak_plot_type,
.f = function(.x) {
if (.x == "density") {
plot(full_result, "mu", type = .x, burn = 500)
} else {
plot(full_result, type = .x, burn = 500)
}
}
)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> = "none")` instead.
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
plot(full_result, , type = "network", burn = 500, min_support = 0.05)
full_res_summary <- summary(full_result)
full_res_summary
## $step
## first last interval n_steps
## 1 10000 50 201
##
## $post
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2133.5 -845.4 -841.7 -848.4 -838.0 -831.1
##
## $like
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2135.8 -847.7 -844.0 -850.7 -840.3 -833.4
##
## $prior
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.302 2.303 2.303 2.303 2.303 2.303
##
## $mu
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.195e-05 4.496e-05 5.051e-05 5.085e-05 5.635e-05 1.000e-04
##
## $pi
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 1 1 1 1 1
##
## $tree
## from to time support generations
## 1 NA 1 -8 NA NA
## 2 1 2 1 0.6517413 1
## 3 1 3 1 1.0000000 1
## 4 1 4 -1 0.6616915 1
## 5 1 5 -1 1.0000000 1
## 6 4 6 5 0.6716418 1
## 7 3 7 6 0.9950249 1
## 8 3 8 7 0.9950249 1
## 9 4 9 4 0.9950249 1
## 10 1 10 0 1.0000000 1
## 11 1 11 6 0.9950249 1
## 12 1 12 3 0.9900498 1
## 13 4 13 10 0.9950249 1
## 14 6 14 9 0.9950249 1
## 15 1 15 9 1.0000000 1
## 16 4 16 11 0.9950249 1
## 17 14 17 16 0.9950249 1
## 18 6 18 16 0.4228856 1
## 19 6 19 15 0.9950249 1
## 20 6 20 15 0.9900498 1
## 21 14 21 16 0.9950249 1
## 22 6 22 18 0.2537313 1
## 23 4 23 12 0.9900498 1
## 24 4 24 13 0.9950249 1
## 25 6 25 21 0.2437811 1
## 26 3 26 16 0.9900498 1
## 27 21 27 23 0.9950249 1
## 28 6 28 17 0.9950249 1
## 29 23 29 21 0.9950249 1
## 30 8 30 19 0.6169154 1
## 31 23 31 23 0.9950249 1
## 32 30 32 27 0.9950249 1
## 33 28 33 26 0.9950249 1
## 34 20 34 24 0.9950249 1
## 35 3 35 20 0.9950249 1
## 36 3 36 22 0.9950249 1
## 37 22 37 29 0.1641791 1
## 38 34 38 32 0.9900498 1
## 39 11 39 24 0.9950249 1
## 40 22 40 27 0.8805970 1
## 41 15 41 22 0.9950249 1
## 42 3 42 24 0.9950249 1
## 43 41 43 32 0.9900498 1
## 44 35 44 31 0.9950249 1
## 45 18 45 28 0.1641791 1
## 46 30 46 30 0.9900498 1
## 47 34 47 33 0.9950249 1
## 48 6 48 21 0.9950249 1
## 49 30 49 33 0.9900498 1
## 50 48 50 35 0.1691542 1
full_res_summary$tree %>%
ggplot(aes(x = support)) +
geom_histogram(fill = "grey40", color = "white", bins = 12) +
scale_x_continuous(limits = c(0, 1)) +
labs(
title = "Consesus ancestry: support"
)
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
full_res_summary$tree$support <- round(full_res_summary$tree$support, 2)
evd_linelist$id <- 1:nrow(evd_linelist)
cons_tree <- make_epicontacts(
evd_linelist,
full_res_summary$tree[-1, ],
id = "id",
from = 1,
to = 2,
directed = TRUE
)
support_pal <- colorRampPalette(
c("#918D98", "#645877", "#423359", "#281449", "#1A0340")
)
age_pal <- colorRampPalette(
c("#3288BD", "#ABDDA4", "#FDAE61", "#D53E4F")
)
cons_tree$linelist$age_class <- cut(
cons_tree$linelist$age,
breaks = c(0, 10, 20, 30, 40, 90),
labels = c("0-10", "11-20", "21-30", "31-40", "41+")
)
plot(
cons_tree,
node_color = "age_class",
node_shape = "sex",
shapes = c(male = "male", female = "female")
)